Load data and set WD

load("~/Documents/Jones lab/Teaching/IMED 7280/IMED7280_2019_datavis_data.rdata")
load("~/Documents/Jones lab/Teaching/IMED 7280/IMED7280_2019_datavis_example.rdata")
setwd("~/Documents/Jones lab/Teaching/IMED 7280")
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4

Group 1: Heatmap

Tasks:

  1. Make a heatmap of the first 100 beta values using ggplot and geom_tile
bet.melt<- melt(bet[1:100,])

h1<- ggplot(bet.melt, aes(Var2, Var1, fill=value))+
  geom_tile()
h1

  1. Change default parameters to increase clarity - think themes, colours, labels, etc.
h2<- h1+theme(axis.text.x = element_text(angle = 90, hjust = 1, size=6),
            axis.text.y = element_text(size=6))+
            xlab("Sample ID")+ 
            ylab("CpG name")+
  scale_fill_gradient(low="white", high="black")
h2

  1. Reorder the X axis by Tissue Note there are at least two ways to do this! From the original data or in ggplot itself. If you have time, try both.
meta.ord<- meta[order(meta$Tissue),]
bet.melt.ord<- melt(bet[1:100,rownames(meta.ord)])

h3<- ggplot(bet.melt.ord, aes(Var2, Var1, fill=value))+
  geom_tile()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size=6),
            axis.text.y = element_text(size=6))+
            xlab("Sample ID")+ 
            ylab("CpG name")+
  scale_fill_gradient(low="white", high="black")
h3

h3.1<- h2+scale_x_discrete(limits=rownames(meta.ord))
h3.1

  1. Repeat this heatmap with only sites from the U6 gene
u6.names<- cord_adult_ct_sub[cord_adult_ct_sub$Closest_TSS_gene_name=="U6",]

bet.u6.melt<- melt(bet[rownames(u6.names),])

h4<- ggplot(bet.u6.melt, aes(Var2, Var1, fill=value))+
  geom_tile()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size=6),
            axis.text.y = element_text(size=6))+
            xlab("Sample ID")+ 
            ylab("CpG name")+
  scale_fill_gradient(low="white", high="black")
  
h4  

  1. ADVANCED: reorder the y axis of the top 100 probes to cluster hierarchically
bet.clust<- hclust(dist(bet[1:100,]))$order
bet.melt.clust<- melt(bet[bet.clust,])
h5<- ggplot(bet.melt.clust, aes(Var2, Var1, fill=value))+
  geom_tile()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size=6),
            axis.text.y = element_text(size=6))+
            xlab("Sample ID")+ 
            ylab("CpG name")+
  scale_fill_gradient(low="white", high="black")+
  scale_x_discrete(limits=rownames(meta.ord))
h5

Group 2: Scatterplot

Tasks

  1. Pick one Tissue type and plot the effect size E versus the pvalue P
s1<- ggplot(cord_adult_ct_sub, aes(CD4T_E, CD4T_P))+
  geom_point()
s1

  1. Turn it into a volcano plot by applying a -log10 to the Y axis
s2<- ggplot(cord_adult_ct_sub, aes(CD4T_E, -log10(CD4T_P)))+
  geom_point()
s2

  1. Change the defaults to make it more reader friendly - think themes, colours, labels, etc.(Try reducing the transparency of the points)
s3<- ggplot(cord_adult_ct_sub, aes(CD4T_E, -log10(CD4T_P)))+
  geom_point(alpha=0.2,  colour="cornflowerblue")+
  theme_bw()+
  xlab("Effect size")+
  ylab("-log10 p value")+
  xlim(c(-0.75, 0.75))+
  ggtitle("CD4T cells")
s3  

  1. Colour only those points with an effect size greater than 0.05 a different colour
cord_adult_ct_sub$large<- NA
cord_adult_ct_sub[abs(cord_adult_ct_sub$CD4T_E)>0.05, "large"]<- "Y"
cord_adult_ct_sub[is.na(cord_adult_ct_sub$large), "large"]<- "N"
s4<- ggplot(cord_adult_ct_sub, aes(CD4T_E, -log10(CD4T_P), colour=large))+
  geom_point(alpha=0.2)+
  scale_colour_manual(values=c("grey", "firebrick"))+
  theme_bw()+
  xlab("Effect size")+
  ylab("-log10 p value")+
  ggtitle("CD4T cells")+
  theme(legend.position =  "none")
s4

  1. Reshape the original data to make multiple faceted volcano plots
ct.melt<- melt(cbind(id=rownames(cord_adult_ct_sub), cord_adult_ct_sub[,1:12]))
## Using id as id variables
ct.melt$type<- NA
ct.melt[grep("E", ct.melt$variable), "type"]<- "Effect"
ct.melt[grep("P", ct.melt$variable), "type"]<- "Pvalue"
ct.melt$cell<- gsub("_.*","", ct.melt$variable)
ct.cast<- dcast(ct.melt, id+cell~type)

ggplot(ct.cast, aes(Effect, -log10(Pvalue)))+
  geom_point(alpha=0.2)+
  scale_colour_manual(values=c("grey", "firebrick"))+
  theme_bw()+
  xlab("Effect size")+
  ylab("-log10 p value")+
  ggtitle("CD4T cells")+
  theme(legend.position =  "none")+
  facet_wrap(~cell)

Group 3: Boxplot

Tasks

  1. Make a boxplot of beta values for one sample using CpG island class as a category (don’t choose an nRBC sample they are weird)
mono.toplot<- data.frame(dat=bet[,"TS194_Mo"], type=cord_adult_ct_sub$HIL_CpG_class)
b1<- ggplot(mono.toplot, aes(type, dat))+
  geom_boxplot()
b1

  1. Change default parameters to increase clarity - think themes, colours, labels, etc.
b2<- ggplot(mono.toplot, aes(type, dat, fill=type))+
  geom_boxplot()+
  theme_bw()+
  scale_fill_manual(values=c("navy", "firebrick", "goldenrod", "darkgreen"))+
  ggtitle("TS194 Monocytes")
b2

  1. Add the original data points as a layer on top of the boxes NOTE - what does boxplot do with outliers??
b3<- b2+
  geom_point(colour="grey", position="jitter", alpha=0.2)
b3

  1. Make a violin plot using the same data
b4<- ggplot(mono.toplot, aes(type, dat, fill=type))+
  geom_violin()+
  theme_bw()+
  scale_fill_manual(values=c("navy", "firebrick", "goldenrod", "darkgreen"))+
  ggtitle("TS194 Monocytes")
b4

  1. Make a faceted violin plot with all the cell types from individual “TS194”
four.toplot<- data.frame(bet[,grep("TS194",colnames(bet) )], type=cord_adult_ct_sub$HIL_CpG_class)
four.melt<- melt(data.frame(cpg=rownames(four.toplot), four.toplot))
## Using cpg, type as id variables
four.melt$cell<- gsub(".*_","", four.melt$variable)
b5<- ggplot(four.melt, aes(type, value, fill=type))+
  geom_violin()+
  theme_bw()+
  scale_fill_manual(values=c("navy", "firebrick", "goldenrod", "darkgreen"))+
  ggtitle("TS194")+
  facet_wrap(~cell)
b5

Group 4: Histogram/density plot

Tasks

  1. Make a histogram of p values for CD4T cells
h1<- ggplot(cord_adult_ct_sub, aes(CD4T_P))+
  geom_histogram()
h1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  1. Change default parameters to make it clearer
h2<- h1+ theme_bw()+
  xlab("P value distribution for CD4T cells")
h2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  1. Overlap histograms of p values for CD4T and Monocytes, and modify them to be clearer
hist.melt<- melt(cord_adult_ct_sub[,c("CD4T_P", "Mono_P")])
## No id variables; using all as measure variables
h3<- ggplot(hist.melt, aes(value, fill=variable))+
  geom_histogram(position="dodge")+
  theme_bw()+
  scale_fill_manual(values=c("cornflowerblue", "firebrick"))
h3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  1. Repeat step 3 but use density plot instead, and modify as needed to make it clear
h4<- ggplot(hist.melt, aes(value, fill=variable))+
  geom_density(alpha=0.5)+
  theme_bw()+
  scale_fill_manual(values=c("cornflowerblue", "firebrick"))
h4

  1. Make both an overlap of all the cell type density plots as well as a faceted version with one distribution per facet and decide which is best overlap first
hist.melt.all<- melt(cord_adult_ct_sub[,grep("_P", colnames(cord_adult_ct_sub))])
## No id variables; using all as measure variables
h5.1<- ggplot(hist.melt.all, aes(value, fill=variable))+
  geom_density(alpha=0.5)+
  theme_bw()+
  scale_fill_brewer(palette="Set1")
h5.1

Faceted

h5.2<- ggplot(hist.melt.all, aes(value, fill=variable))+
  geom_density()+
  theme_bw()+
  scale_fill_brewer(palette="Set1")+
  facet_wrap(~variable)
h5.2